*To see the results, please scroll down to the bottom of the file.
# import packages:
import pandas as pd
import numpy as np
import re
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt
import random
First, skim through the raw data and manually delete the obviously meaningless entries. Then import the data into Python and rename column1, column2 and column3 to ['self', 'classmates', 'instructors'].
# Read data in from csv file.
# 'ansi' is a Windows-only alias of the 'mbcs' codec; on other platforms
# Python raises LookupError for it. Fall back to cp1252 there.
# NOTE(review): cp1252 assumes the file was saved on a Western-European
# Windows system -- confirm against the actual data and adjust if needed.
try:
    sentiment = pd.read_csv('sentiment-data.csv', encoding='ansi')
except LookupError:
    sentiment = pd.read_csv('sentiment-data.csv', encoding='cp1252')
# Rename the three columns
sentiment.columns = ['self', 'classmates', 'instructors']
# Glance at the first ten rows of the dataset
sentiment.head(10)
| self | classmates | instructors | |
|---|---|---|---|
| 0 | Excited, Hopeful, Prepared, good | Ready | Informed |
| 1 | Unsure, confused, anxious, curious | apathetic | Excited |
| 2 | Co operations, Teamwork, communication, critic... | Teamwork | pass |
| 3 | First, team work, nervous, curious | Nervous | New |
| 4 | Interesting. New. Exciting. Develop | Interesting | Excited |
| 5 | perplexed???anxious???embarrassed???bit excited | hopefully kind | responsible |
| 6 | Novel, Unknown, Challenging, Useful | Nervous | accustomed |
| 7 | Worried, excited, self-doubt, motivated | Nervous | Hopeful |
| 8 | Excited,curious,nervous,worried | Excited | Eager |
| 9 | Excited, nervous, confused, pleased | Similarly | Normal |
def clean_self(words):
    """Split a free-text answer into a list of cleaned, lower-cased words.

    A word is a maximal run of ASCII letters, spaces and hyphens, so phrases
    such as "team work" and hyphenated words such as "self-doubt" stay
    together, while any other character (comma, period, "?", ...) acts as a
    separator. Each word is stripped of surrounding whitespace and
    lower-cased; words that are empty after stripping are dropped.

    Args:
        words (str): Raw answer text, e.g. 'Excited, Hopeful, Prepared, good'.
            Non-string values (such as a NaN from an empty CSV cell) are
            treated as containing no words.

    Returns:
        list[str]: The parsed words, in their original order.
    """
    if not isinstance(words, str):
        return []
    pattern = re.compile(r'[A-Za-z\ \-]+', re.I)
    tokens = (chunk.strip().lower() for chunk in pattern.findall(words))
    # A run consisting only of spaces (e.g. between ", ,") strips to '';
    # skip those so callers never see empty words.
    return [token for token in tokens if token]
# Smoke checks for clean_self (results are only displayed, not asserted)
clean_self('Helpful, intense, team-based, excited')
clean_self('First, team work, nervous, curious')
# Run clean_self on the 'self' value of rows 0 through 12 (.loc label slices include the end label)
sentiment.loc[0:12, :].apply(lambda row: clean_self(row.self), axis = 1)
0 [excited, hopeful, prepared, good] 1 [unsure, confused, anxious, curious] 2 [co operations, teamwork, communication, criti... 3 [first, team work, nervous, curious] 4 [interesting, new, exciting, develop] 5 [perplexed, anxious, embarrassed, bit excited] 6 [novel, unknown, challenging, useful] 7 [worried, excited, self-doubt, motivated] 8 [excited, curious, nervous, worried] 9 [excited, nervous, confused, pleased] 10 [delightful, excited, tense, puzzled] 11 [helpful, intense, team-based, excited] 12 [scared, nervous, afraid, hard] dtype: object
The code above works well.
Next, use a regular expression to extract a single word from column2 and column3.
For example, if the input is 'pathetic (when examing the student's work)', then the output should be 'pathetic'.
def clean_single_word(words):
    """Return the first word found in a free-text answer, lower-cased.

    A word is the first run of ASCII letters and hyphens; everything after
    it (spaces, punctuation, further words) is ignored.

    Args:
        words (str): Raw answer text, e.g.
            "pathetic (when examing the student's work)".

    Returns:
        str: The first word in lower case, or '' when the input is not a
        string or contains no letters/hyphens at all.
    """
    if not isinstance(words, str):
        return ''
    # search() yields the same leftmost match that findall()[0] would, but
    # returns None instead of raising when nothing matches.
    found = re.search(r'[a-zA-Z\-]+', words)
    return found.group(0).lower() if found else ''
# Smoke checks for clean_single_word (results are only displayed, not asserted)
clean_single_word('hard, cooperate, tricky, important')
clean_single_word('pathetic (when examing the student\'s work)')
# Same check on a real value taken from the first row of the dataset
clean_single_word(sentiment.loc[0, 'classmates'])
'ready'
The code above also works well.
def clean(row):
    """Clean the three answer fields of one row and return that row.

    'self' is replaced by the word list produced by clean_self, while
    'classmates' and 'instructors' are each replaced by the single word
    produced by clean_single_word. The fields are assigned on the row
    object that was passed in, and that same object is returned.
    """
    cleaners = (
        ('self', clean_self),
        ('classmates', clean_single_word),
        ('instructors', clean_single_word),
    )
    for column, cleaner in cleaners:
        setattr(row, column, cleaner(getattr(row, column)))
    return row
The next step is to apply the function to every row of the dataset. In pandas, a convenient way to do this is the `DataFrame.apply` method.
# Run clean() on every row (axis=1 hands each row to the function) and collect the results
sentiment_clean = sentiment.apply(clean, axis=1)
The data should now be clean. Display the first 20 rows to see what it looks like.
# Show the first 20 rows of the cleaned data:
sentiment_clean.head(20)
| self | classmates | instructors | |
|---|---|---|---|
| 0 | [excited, hopeful, prepared, good] | ready | informed |
| 1 | [unsure, confused, anxious, curious] | apathetic | excited |
| 2 | [co operations, teamwork, communication, criti... | teamwork | pass |
| 3 | [first, team work, nervous, curious] | nervous | new |
| 4 | [interesting, new, exciting, develop] | interesting | excited |
| 5 | [perplexed, anxious, embarrassed, bit excited] | hopefully | responsible |
| 6 | [novel, unknown, challenging, useful] | nervous | accustomed |
| 7 | [worried, excited, self-doubt, motivated] | nervous | hopeful |
| 8 | [excited, curious, nervous, worried] | excited | eager |
| 9 | [excited, nervous, confused, pleased] | similarly | normal |
| 10 | [delightful, excited, tense, puzzled] | tense | amazed |
| 11 | [helpful, intense, team-based, excited] | helpful | excited |
| 12 | [scared, nervous, afraid, hard] | nervous | excited |
| 13 | [nervous, united, hard, transition] | hard | flat |
| 14 | [nervous, stressed, curious, confused] | nervous | excited |
| 15 | [anxious, nervous, excited, apprehensive] | anxious | excited |
| 16 | [cool, stressful, curiouseness, unknown] | curiouness | curiouness |
| 17 | [excited, curious, inquisitive, nervous] | excited | self-confident |
| 18 | [exciting, unknown, difficult, interesting] | exciting | unsurprising |
| 19 | [scared, nervous, worried, apprehensive] | nervous | excited |
At this point, the dataset has been cleaned. Save the cleaned data to a new CSV file.
# Save the cleaned data to disk as CSV (to_csv also writes the row index by default):
sentiment_clean.to_csv('sentiment-data-cleaned.csv')
We decided to use the words in 'self' to make a word cloud.
First combine all the words in 'self' into a single string, then pass this string to wc.generate to create the graph.
# Collect every word of the 'self' column into one flat list
all_words = []


def extract_word(row):
    """Append each word in row.self to the module-level all_words list.

    Args:
        row: A row of the cleaned dataset whose 'self' field is a list of words.

    Returns:
        The row that was passed in, unchanged.
    """
    all_words.extend(row.self)
    return row


# Use a plain loop rather than DataFrame.apply: extract_word works purely by
# side effect, and older pandas releases called the applied function twice on
# the first row, which would duplicate that row's words in all_words.
for _, cleaned_row in sentiment_clean.iterrows():
    extract_word(cleaned_row)
# Display the cleaned frame (extract_word leaves the rows untouched)
sentiment_clean
| self | classmates | instructors | |
|---|---|---|---|
| 0 | [excited, hopeful, prepared, good] | ready | informed |
| 1 | [unsure, confused, anxious, curious] | apathetic | excited |
| 2 | [co operations, teamwork, communication, criti... | teamwork | pass |
| 3 | [first, team work, nervous, curious] | nervous | new |
| 4 | [interesting, new, exciting, develop] | interesting | excited |
| ... | ... | ... | ... |
| 120 | [excited, nervous, curious, difficult] | excited | excited |
| 121 | [professional, precise, academic, pragmatic] | unfamiliar | significative |
| 122 | [excited, anxious, no experience] | anxious | normal |
| 123 | [concerned, anxious, interested, curious] | interested | excited |
| 124 | [excited, anxious, accomplished, ready] | ready | ready |
125 rows × 3 columns
wc = WordCloud(
    background_color='white',  # Set the background color, default is black
    width=500,  # Canvas width
    height=350,  # Canvas height
    max_font_size=50,  # Maximum font size
    min_font_size=10,  # Minimum font size
    # NOTE: per the wordcloud documentation, 'RGBA' gives a transparent
    # background only when background_color is None; with 'white' set above
    # the background stays white.
    mode='RGBA',
)
# Combine all the words in 'self' into one space-separated string.
# The guard keeps this cell safe to re-run: once all_words is already the
# joined string, joining it again would put a space between every character.
if not isinstance(all_words, str):
    all_words = " ".join(all_words)
# Create the word cloud from the combined text
word_cloud = wc.generate(all_words)
# Show the word cloud as a picture
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
# To save the image in .png format, uncomment the next line:
#wc.to_file('wc.png')
At this point the analysis could have ended, but we wanted to go further and represent all three columns of the data, so we created the interactive graph below.
To make the graph, we need to process the data further.
# A brief quantitative count of the results for each column
max_limit = 50  # keep at most this many distinct words per column


def _top_counts(frame, column, limit=max_limit):
    """Return the `limit` most frequent values of frame[column] with their counts."""
    return frame.groupby(column).size().sort_values(ascending=False).iloc[0:limit]


# Work on sentiment_clean, not sentiment: the original frame only holds cleaned
# values if DataFrame.apply happened to mutate it in place, which pandas does
# not guarantee. On raw strings the loop below would iterate single characters.
self_word_list = [word for words in sentiment_clean['self'] for word in words]
self_df = pd.DataFrame({'self': self_word_list})
classmates_count = _top_counts(sentiment_clean, 'classmates')
instructors_count = _top_counts(sentiment_clean, 'instructors')
self_count = _top_counts(self_df, 'self')

# Write the required data to the lists.
# The same word can occur in several columns, and the chart identifies nodes
# by their label, so classmates labels get a trailing space and instructors
# labels a leading space to keep them distinct from the 'self' labels.
character_list = []
parent_list = []
value_list = []
for parent_name, counts, label_format in (
    ('classmates', classmates_count, '{} '),
    ('instructors', instructors_count, ' {}'),
    ('self', self_count, '{}'),
):
    # items() pairs each word with its count; no positional indexing on a
    # label-indexed Series (deprecated in recent pandas).
    for word, count in counts.items():
        character_list.append(label_format.format(word))
        parent_list.append(parent_name)
        value_list.append(count)

# The three column nodes themselves, each attached to the 'sentiment' root
for column_name in ('classmates', 'self', 'instructors'):
    parent_list.append('sentiment')
    character_list.append(column_name)
    value_list.append(1)

# Draw a sunburst graph
data = dict(
    character=character_list,
    parent=parent_list,
    value=value_list,
)
fig = px.sunburst(
    data,
    names='character',
    parents='parent',
    values='value',
)
fig.show()
## end
Try hovering over and clicking on the chart; it will show more detailed information.
This is the end of the file.